package com.meiyuetao.myt.crawl.filter;

import java.util.Map;
import java.util.Set;

import lab.s2jh.core.exception.ServiceException;
import lab.s2jh.crawl.filter.ParseFilterChain;
import lab.s2jh.crawl.service.HtmlunitService;

import org.apache.commons.lang3.StringUtils;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.meiyuetao.myt.crawl.entity.ParseCommodity;
import com.meiyuetao.myt.md.entity.Commodity;

/**
 * Parse filter for single-commodity detail pages on Tmall.
 *
 * <p>The page is loaded through HtmlUnit, polled until the description
 * placeholder text disappears, and then handed to the XPath-based parse
 * helpers inherited from {@link AbstractCommodityParseFilter}.
 */
public class TMallSingleParseFilter extends AbstractCommodityParseFilter {

    private static final Logger logger = LoggerFactory.getLogger(TMallSingleParseFilter.class);

    private static final String[] ALERT_DOMAIN_INFOS = { "taobao.com", "tmall.com", "tbcdn.cn" };

    /** Maximum number of one-second polls while waiting for the description to load. */
    private static final int TMALL_KISSY_WAIT_SECONDS = 60;

    /** Placeholder text present in the page while the description is still loading. */
    private static final String DESCRIPTION_LOADING_MARKER = "描述加载中";

    /** XPath of the header area of the detail page (title and sale prompt live below it). */
    private static final String DETAIL_HEADER_XPATH = "//DIV[@id='J_DetailMeta']//DIV[@class='tb-property']"
            + "//DIV[@class='tb-wrap']//DIV[@class='tb-detail-hd']";

    /** XPath of the promotional price node; tried first. */
    private static final String PROMO_PRICE_XPATH = "//LI[@id='J_PromoPrice']//SPAN[@class='tm-price']";

    /** XPath of the regular price node; used when no promotional price is found. */
    private static final String DEFAULT_PRICE_XPATH = "//LI[@id='J_StrPriceModBox']//SPAN[@class='tm-price']";

    /** XPath of the main (booth) picture. */
    private static final String BOOTH_IMG_XPATH = "//IMG[@id='J_ImgBooth']";

    /**
     * Loads the page and polls it once per second, up to
     * {@link #TMALL_KISSY_WAIT_SECONDS} times, until the description placeholder
     * text is no longer present in the page XML.
     *
     * <p>The page is returned even if the placeholder never disappears (a warning
     * is logged) or if the waiting thread is interrupted (the interrupt flag is
     * restored so callers can observe it).
     *
     * @param url address of the page to load
     * @return the loaded page, possibly with the description still unloaded
     * @throws ServiceException if loading or inspecting the page fails
     */
    protected HtmlPage fetchHtmlPage(String url) {
        try {
            HtmlPage page = HtmlunitService.buildWebClient().getPage(url);
            try {
                if (waitForDescription(page, url)) {
                    logger.debug("Parse page description success for: {}", url);
                } else {
                    logger.warn("Parse page description failure for: {}", url);
                }
            } catch (InterruptedException e) {
                // Restore the interrupt flag instead of swallowing it; still hand back
                // whatever has been loaded so far (best effort, as before).
                Thread.currentThread().interrupt();
                logger.warn("Interrupted while waiting page description for: {}", url, e);
            }
            return page;
        } catch (Exception e) {
            throw new ServiceException("htmlunit.page.error", e);
        }
    }

    /**
     * Polls the page until the description placeholder disappears or the wait budget is used up.
     *
     * @return {@code true} if the placeholder disappeared within the budget
     */
    private boolean waitForDescription(HtmlPage page, String url) throws InterruptedException {
        for (int second = 1; second <= TMALL_KISSY_WAIT_SECONDS; second++) {
            // Give the background JavaScript some time to run before inspecting the page.
            logger.debug("Sleep {} seconds to wait KISSY execution...", second);
            Thread.sleep(1000);
            if (page.asXml().indexOf(DESCRIPTION_LOADING_MARKER) == -1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Fetches the page at {@code url}, parses it into the matching
     * {@link ParseCommodity} (looked up by {@code baseUrl}, created when absent)
     * and saves the result.
     *
     * <p>Title, description and window images are parsed only when the record's
     * operation group is {@code "PARSING"}; price, sale prompt and stock are
     * parsed on every invocation.
     *
     * @param url         address of the commodity page
     * @param filterChain the filter chain (not used by this method)
     */
    @Override
    public void doFilterInternal(String url, ParseFilterChain filterChain) {
        logger.debug("Invoking {} ...", this.getClass());
        HtmlPage htmlPage = fetchHtmlPage(url);
        ParseCommodity parseCommodity = parseCommodityService.findByProperty("baseUrl", url);
        if (parseCommodity == null) {
            parseCommodity = new ParseCommodity(url);
            // Source group identifier; normally the main domain of the site.
            parseCommodity.setSourceType("tmall.com");
            parseCommodity.setUid(url);
        }
        parseCommodity.reset();
        // Only parse and store the descriptive attributes when the record is flagged for re-parsing.
        if ("PARSING".equals(parseCommodity.getOperationGroup())) {
            // NOTE(review): reset() was already invoked just above; kept as in the original
            // because its effect is not visible from this file.
            parseCommodity.reset();
            // Commodity title
            String title = parseTitle(htmlPage, DETAIL_HEADER_XPATH + "//H1");
            parseCommodity.setTitle(title);
            // Commodity description
            parseDescription(parseCommodity, htmlPage, "//DIV[@id='description']", "//DIV[@id='description']//IMG", "data-ks-lazyload", ALERT_DOMAIN_INFOS);
            // Window (thumbnail) images
            parseWindowImgs(parseCommodity, htmlPage, "//UL[@id='J_UlThumb']//IMG", BOOTH_IMG_XPATH, "data-lazyload");
            // Extended data: the lookup is kept from the original, but nothing is done with it yet.
            Commodity commodity = commodityService.findByProperty("sourceUrl", url);
            if (commodity != null) {
                // TODO: crawl comments (not implemented)
            }
        }

        // Sale price
        parseSalePrice(parseCommodity, htmlPage, PROMO_PRICE_XPATH, DEFAULT_PRICE_XPATH);
        // Sale prompt (promotion slogan); previously parsed twice with the same XPath.
        parseSalePrompt(parseCommodity, htmlPage, DETAIL_HEADER_XPATH + "//P");
        // Stock
        parseSaleStock(parseCommodity, htmlPage, "//EM[@id='J_EmStock']");

        logger.debug("Saving Parse Commodity: {}", parseCommodity);
        parseCommodity.setUid(url);
        DateTime fetchTime = new DateTime();
        parseCommodity.setLastFetchTime(fetchTime.getMillis());
        parseCommodity.setLastFetchTimeLabel(fetchTime.toString("yyyy-MM-dd HH:mm:ss"));
        parseCommodityService.save(parseCommodity);
    }

    /**
     * Extracts a minimal summary (title, sale price, main picture) from the page.
     *
     * @param url address of the commodity page
     * @return an insertion-ordered map with the keys {@code "title"},
     *         {@code "salePrice"} and {@code "pic"}; {@code null} when the URL is
     *         not accepted, no sale price can be read, or the main picture node is
     *         missing
     */
    @Override
    public Map<String, Object> parseSimpleData(String url) {
        if (!isAcceptUrl(url)) {
            return null;
        }
        HtmlPage htmlPage = fetchHtmlPage(url);
        Map<String, Object> jsonMap = Maps.newLinkedHashMap();

        // NOTE(review): the title is read from H3 here but from H1 in doFilterInternal — confirm intended.
        String title = parseTitle(htmlPage, DETAIL_HEADER_XPATH + "//H3");
        jsonMap.put("title", title);

        // Sale price: promotional price first, regular price as fallback.
        HtmlElement salePriceNode = htmlPage.getFirstByXPath(PROMO_PRICE_XPATH);
        if (salePriceNode == null) {
            salePriceNode = htmlPage.getFirstByXPath(DEFAULT_PRICE_XPATH);
        }
        String salePrice = salePriceNode != null ? salePriceNode.asText() : "";
        salePrice = cleanInvisibleChar(salePrice);
        if (StringUtils.isBlank(salePrice)) {
            return null;
        }
        // Trim first so the character that is inspected is also the one that gets removed.
        salePrice = salePrice.trim();
        if (salePrice.isEmpty()) {
            return null;
        }
        char first = salePrice.charAt(0);
        if (first > '9' || first < '0') {
            // Drop a single leading non-digit character (e.g. a currency symbol).
            salePrice = salePrice.substring(1);
        }
        jsonMap.put("salePrice", salePrice);

        HtmlElement picNode = htmlPage.getFirstByXPath(BOOTH_IMG_XPATH);
        if (picNode == null) {
            return null;
        }
        // Prefer the lazy-load attribute and fall back to the plain src attribute.
        String src = parseImgSrc(url, picNode.getAttribute("data-lazyload"));
        if (StringUtils.isBlank(src)) {
            src = parseImgSrc(url, picNode.getAttribute("src"));
        }
        // Record the picture in both cases (it used to be stored only on the fallback path).
        jsonMap.put("pic", src);

        return jsonMap;
    }

    /**
     * Returns the image URLs to be removed: the lazy-load spacer GIF.
     *
     * @return a new mutable set containing that single URL
     */
    protected Set<String> removeImgSrcUrls() {
        Set<String> imgUrls = Sets.newHashSet();
        imgUrls.add("http://a.tbcdn.cn/kissy/1.0.0/build/imglazyload/spaceball.gif");
        return imgUrls;
    }

    /**
     * Manual smoke test: registers URL rules on {@link HtmlunitService}
     * (image, CSS and one alicdn pattern prefixed with {@code -}, everything else
     * with {@code +}) and fetches a sample Tmall item page.
     */
    public static void main(String[] args) {
        HtmlunitService.addUrlRule("-http://.*\\.gif");
        HtmlunitService.addUrlRule("-http://.*\\.jpg");
        HtmlunitService.addUrlRule("-http://.*\\.png");
        HtmlunitService.addUrlRule("-http://.*\\.css");
        HtmlunitService.addUrlRule("-http://amos.alicdn.com/online.aw.*");
        HtmlunitService.addUrlRule("+http://.*");
        TMallSingleParseFilter tMallSingleParseFilter = new TMallSingleParseFilter();
        tMallSingleParseFilter.fetchHtmlPage("http://detail.tmall.com/item.htm?id=17358761109");
    }
}
